/***
### Step 5: Create Supply Chain Network
**File:** `1_1_SC_Network.do`  
**Language:** Stata  
**Description:** Constructs the Supply Chain Network by combining Compustat, Factset, and VTNIC data.
- **Inputs:**
  - compustat_supplychain_update20210209.dta (Compustat segment data)
  - factset_update20210209.dta (Factset edgelist of SC partners)
  - Sales_cogs_85_2019.dta (Compustat data on sales and Cost of goods sold)
  - VTNIC_89_2019 (From Fresard, Hoberg, and Phillips. See: https://faculty.marshall.usc.edu/Gerard-Hoberg/FresardHobergPhillipsDataSite/idata/VertNetwork_10gran.zip)
- **Outputs:**
  - SCnetwork2.dta
  - SCnetwork2_saveold.dta (Same as SCnetwork2 but in old Stata format, which SAS can import)
  - SCnetwork2.csv (for use in Matlab and Python)

***/

global dir your_path
cd "$dir"

//-------------------------Compustat Section---------------------------------
//-------------------------Compustat Section---------------------------------
//-------------------------Compustat Section---------------------------------
//-------------------------Compustat Section---------------------------------
//-------------------------Compustat Section---------------------------------

use "compustat_supplychain_update20210209.dta", clear
replace score = 1 if score > 1 & !missing(score)
//double check that gvkey1 is ALWAYS customer
replace supplier = 1 if missing(supplier)
rename score rawscore
egen pairid = group(gvkey1 gvkey2)
xtset pairid fyear
tsfill, full
duplicates drop pairid fyear, force
bysort pairid : egen nfilled = count(gvkey1) 
sum nfilled
global max = r(max)
destring gvkey1, replace
destring gvkey2, replace
forvalues i = 1/$max {
	replace gvkey1 = F`i'.gvkey1 if missing(gvkey1) 
	replace gvkey1 = L`i'.gvkey1 if missing(gvkey1) 
	replace gvkey2 = F`i'.gvkey2 if missing(gvkey2)
	replace gvkey2 = L`i'.gvkey2 if missing(gvkey2)
}
gen missingscore = missing(rawscore)
replace rawscore = 0 if rawscore < 0 
bysort pairid: egen minscore = min(rawscore)
replace rawscore = minscore if missing(rawscore)
replace rawscore = 0 if missing(rawscore)
gen score = (rawscore*.5) + .5
replace score = .5 if missing(score)
drop pairid
drop minscore
drop nfilled
//Need to triple check that supplier flag is defined appropriately. 
replace supplier = 1 if missing(supplier) 
//gen source = 1
gen original = 1
save "compustatpairs.dta", replace

use "compustatpairs.dta", clear
rename gvkey1 gvkey
rename gvkey2 gvkey1
rename gvkey gvkey2
replace supplier = 1-supplier
replace original = 0 
append using "compustatpairs.dta"

duplicates tag gvkey1 gvkey2 fyear, gen(tag)
tab tag
keep if tag == 0 | original == 1
drop original 
order gvkey1 gvkey2
drop tag
replace source = 1 if missing(source)
save "compustatpairs.dta", replace

use "Sales_cogs_85_2019.dta", clear
destring gvkey, replace
rename gvkey gvkey1
rename cogs cogs1 
rename sale sales1
merge 1:m gvkey1 fyear using "compustatpairs.dta"
keep if _merge == 3
drop _merge 
save "compSCdirected.dta", replace

use "Sales_cogs_85_2019.dta", clear
destring gvkey, replace
rename gvkey gvkey2
rename cogs cogs2 
rename sale sales2
merge 1:m gvkey2 fyear using "compSCdirected.dta"
keep if _merge == 3
drop _merge 
order gvkey1 gvkey2 fyear

winsor2 cogs1 cogs2 sales1 sales2, replace
gen rev = rawscore*sales2 if supplier == 0 
gen cogsscore = rev/cogs1 if supplier == 0
replace cogsscore = 1 if cogsscore > 1 & !missing(cogsscore)

replace rawscore = cogsscore if supplier == 0
replace missingscore = 0 if supplier == 0 & !missing(rawscore)
egen pairid = group(gvkey1 gvkey2)
replace missingscore = 1 if missing(rawscore)
bysort pairid: egen minscore = min(cogsscore)
replace rawscore = minscore if missing(rawscore) & supplier == 0 
replace rawscore = 0 if missing(rawscore) & supplier == 0
replace score = (rawscore*.5) + .5 if supplier == 0
replace score = .5 if supplier == 0 & (score < .5 | missing(score))
keep gvkey1 gvkey2 fyear rawscore source score supplier missingscore
save "compSCdirected.dta", replace
capture erase "compustatpairs.dta"


//-------------------------Factset Section---------------------------------
//-------------------------Factset Section---------------------------------
//-------------------------Factset Section---------------------------------
//-------------------------Factset Section---------------------------------
//-------------------------Factset Section---------------------------------

use "factset_update20210209.dta", clear
replace score = 1 if score > 1 & !missing(score)
//double check that gvkey1 is ALWAYS customer
replace supplier = 1 if missing(supplier)
rename score rawscore
egen pairid = group(gvkey1 gvkey2)
duplicates tag pairid fyear, gen(tag) 
keep if !missing(rawscore) | tag < 1 
drop tag
duplicates drop pairid fyear, force
xtset pairid fyear
tsfill, full
bysort pairid : egen nfilled = count(gvkey1) 
sum nfilled
global max = r(max)
destring gvkey1, replace
destring gvkey2, replace
forvalues i = 1/$max {
	replace gvkey1 = F`i'.gvkey1 if missing(gvkey1) 
	replace gvkey1 = L`i'.gvkey1 if missing(gvkey1) 
	replace gvkey2 = F`i'.gvkey2 if missing(gvkey2)
	replace gvkey2 = L`i'.gvkey2 if missing(gvkey2)
}
gen missingscore = missing(rawscore)
replace rawscore = 0 if rawscore < 0 
bysort pairid: egen minscore = min(rawscore)
replace rawscore = minscore if missing(rawscore)
replace rawscore = 0 if missing(rawscore)
gen score = (rawscore*.5) + .5
replace score = .5 if missing(score)
drop pairid
drop minscore
drop nfilled
//Need to triple check that supplier flag is defined appropriately. 
replace supplier = 1 if missing(supplier) 
replace source = 2
gen original = 1
save "factsetpairs.dta", replace

use "factsetpairs.dta", clear
rename gvkey1 gvkey
rename gvkey2 gvkey1
rename gvkey gvkey2
replace supplier = 1-supplier
replace original = 0 
append using "factsetpairs.dta"

duplicates tag gvkey1 gvkey2 fyear, gen(tag)
tab tag
keep if tag == 0 | original == 1
drop original 
order gvkey1 gvkey2
drop tag
replace source = 2 if missing(source)
save "factsetpairs.dta", replace

use "Sales_cogs_85_2019.dta", clear
destring gvkey, replace
rename gvkey gvkey1
rename cogs cogs1 
rename sale sales1
merge 1:m gvkey1 fyear using "factsetpairs.dta"
keep if _merge == 3
drop _merge 
save "factsetdirected.dta", replace

use "Sales_cogs_85_2019.dta", clear
destring gvkey, replace
rename gvkey gvkey2
rename cogs cogs2 
rename sale sales2
merge 1:m gvkey2 fyear using "factsetdirected.dta"
keep if _merge == 3
drop _merge 
order gvkey1 gvkey2 fyear

winsor2 cogs1 cogs2 sales1 sales2, replace
gen rev = rawscore*sales2 if supplier == 0 
gen cogsscore = rev/cogs1 if supplier == 0
replace cogsscore = 1 if cogsscore > 1 & !missing(cogsscore)

replace rawscore = cogsscore if supplier == 0
replace missingscore = 0 if supplier == 0 & !missing(rawscore)
egen pairid = group(gvkey1 gvkey2)
replace missingscore = 1 if missing(rawscore)
bysort pairid: egen minscore = min(cogsscore)
replace rawscore = minscore if missing(rawscore) & supplier == 0 
replace rawscore = 0 if missing(rawscore) & supplier == 0
replace score = (rawscore*.5) + .5 if supplier == 0
replace score = .5 if supplier == 0 & (score < .5 | missing(score))
keep gvkey1 gvkey2 fyear rawscore source score supplier missingscore
save "factsetdirected.dta", replace

capture erase "factsetpairs.dta"

//------------------------- VTNIC Section ---------------------------------
//------------------------- VTNIC Section ---------------------------------
//------------------------- VTNIC Section ---------------------------------
//------------------------- VTNIC Section ---------------------------------
//------------------------- VTNIC Section ---------------------------------

use "VTNIC_89_2019.dta", clear
*** Remove own links
drop if gvkey1 == gvkey2
capture drop supplier
gen supplier = 1
capture drop source 
gen source = 3
rename score rawscore

*** Keep the highest 20 neighbors for each firm/year
gen negscore = -rawscore
sort fyear gvkey1 negscore
bysort fyear gvkey1: gen mycount = _n
keep if mycount <= 20
save "VTNIC2.dta", replace


use "VTNIC_89_2019.dta", clear
*** Remove own links
drop if gvkey1 == gvkey2
capture drop supplier
gen supplier = 1
capture drop source 
gen source = 3
rename score rawscore

rename gvkey1 gvkey 
rename gvkey2 gvkey1
rename gvkey gvkey2 
replace supplier = 0 
gen negscore = -rawscore
sort fyear gvkey1 negscore
bysort fyear gvkey1: gen mycount = _n
keep if mycount <= 20
append using "VTNIC2.dta" 

sum rawscore, d
gen score = (rawscore/r(max))*.09 + .01
save "VTNIC2.dta", replace




//------------------------- Append Comp, Factset, and VTNIC ---------------------------------
//------------------------- Append Comp, Factset, and VTNIC ---------------------------------
//------------------------- Append Comp, Factset, and VTNIC ---------------------------------
//------------------------- Append Comp, Factset, and VTNIC ---------------------------------
//------------------------- Append Comp, Factset, and VTNIC ---------------------------------

use "factsetdirected.dta", clear
append using "compSCdirected.dta"
append using "VTNIC2.dta"
egen pairid = group(gvkey1 gvkey2)
duplicates tag pairid fyear, gen(tag) 
gen keep = 5 if tag == 0
replace keep = 4 if tag > 0 & source == 1 & missingscore == 0
replace keep = 3 if tag > 0 & source == 2 & missingscore == 0
replace keep = 2 if tag > 0 & source == 1 & missingscore == 1
replace keep = 1 if tag > 0 & source == 2 & missingscore == 1
replace keep = 0 if tag > 0 & source == 3
bysort pairid fyear: egen maxkeep = max(keep) 
keep if keep == maxkeep
drop tag keep maxkeep
duplicates drop pairid fyear, force
replace missingscore = 0 if source == 3
save "SCnetwork2.dta", replace

export delimited using "SCnetwork2.csv", replace


//------------------------- SAVE NETWORK FOR SAS use ---------------------------------
use "SCnetwork2.dta", clear

saveold "$dir//SCnetwork2_saveold.dta", replace

